import pandas as pd
import numpy as np
from langdetect import detect as langdetect
import random
seed = 42
np.random.seed(seed)
random.seed(seed)
import sys
sys.path.append("..")
from data.labeled.raw import CWA as cwa, CWR as cwr
from data.labeled.preprocessed import LABELED_CITIES_PATH, RISKS_MAPPING_PATH, SEVERITY_MAPPING_PATH, IMPUTATION_REPORT_PATH
cwa.head()
cwr.head()
# Normalize column names: lowercase, spaces to underscores, drop apostrophes
cwa.columns = cwa.columns.str.lower().str.replace(" ", "_").str.replace("'","")
cwr.columns = cwr.columns.str.lower().str.replace(" ", "_").str.replace("'","")
cwr.isnull().sum()
cwa.isnull().sum()
id_columns = list(set(cwr.columns) & set(cwa.columns))
id_columns
cwr[id_columns].isnull().sum()
cwa[id_columns].isnull().sum()
cwr['c40'] = cwr['c40'] == 'C40'
cwr[id_columns] = cwr[id_columns].fillna('nan')
cwa['c40'] = cwa['c40'] == 'C40'
cwa[id_columns] = cwa[id_columns].fillna('nan')
len(cwa[id_columns]), len(cwa[id_columns].drop_duplicates())
len(cwr[id_columns]), len(cwr[id_columns].drop_duplicates())
cwa_agg = cwa.groupby(id_columns).agg(set).reset_index()
cwr_agg = cwr.groupby(id_columns).agg(set).reset_index()
len(cwa_agg[id_columns]), len(cwa_agg[id_columns].drop_duplicates())
len(cwr_agg[id_columns]), len(cwr_agg[id_columns].drop_duplicates())
# When several magnitudes are reported for the same id, keep the most severe one
cwr_agg.magnitude = cwr_agg.magnitude.apply(lambda x: 'Extremely serious' if 'Extremely serious' in x else ('Serious' if 'Serious' in x else list(x)[0]))
cwr_agg.magnitude.unique()
Fill the non-merged rows with {np.nan} in the list columns so that every list column consistently holds a set
cw_data = pd.merge(cwa_agg, cwr_agg, on=id_columns, how='outer')
list_columns = cw_data.iloc[0].apply(lambda x: isinstance(x, set))
list_columns = list_columns[list_columns].index
cw_data[list_columns] = cw_data[list_columns].applymap(lambda x: {np.nan} if pd.isna(x) else x)
len(cw_data[id_columns]), len(cw_data[id_columns].drop_duplicates())
cw_data.head()
cw_data['city_location'] = cw_data['city_location'].apply(eval)  # stringified "(latitude, longitude)" tuples
cw_data['latitude'] = cw_data['city_location'].apply(lambda x: x[0])
cw_data['longitude'] = cw_data['city_location'].apply(lambda x: x[1])
id_columns = [x for x in id_columns] + ['latitude','longitude']
len(cw_data[id_columns]), len(cw_data[id_columns].drop_duplicates())
cw_value_columns = [ x for x in cw_data.columns if x not in id_columns]
# Values that can be converted back to strings from sets
singular_cols = cw_data.set_index(id_columns)[list_columns].applymap(lambda x: len(x)==1).product() > 0
singular_cols
len(cw_data[id_columns]), len(cw_data[id_columns].drop_duplicates())
# magnitude is nicely singular per id
cw_data.loc[:, singular_cols[singular_cols].index] = cw_data.loc[:, singular_cols[singular_cols].index].applymap(lambda x: list(x)[0])
list_columns = [x for x in list_columns if x not in singular_cols[singular_cols].index]
Also convert the sets to lists, keeping only entries that are non-null, non-empty, not error markers, and detected as English
def try_lang_detect_except(x):
    # True only if langdetect identifies the text as English; langdetect raises on
    # empty or undetectable strings, so treat those as non-English
    try:
        return langdetect(x) == 'en'
    except Exception:
        return False
cw_data = cw_data.applymap(lambda x: [t for t in x if not pd.isnull(t) and (t!='') and (t!='ERROR: #NAME?') and try_lang_detect_except(t)] if isinstance(x,set) else x)
len(cw_data[id_columns]), len(cw_data[id_columns].drop_duplicates())
cw_data.risks_to_citys_water_supply.unique()
cw_data.risks_to_citys_water_supply = cw_data.risks_to_citys_water_supply.apply(lambda x: x.replace('Inadequate or ageing infrastructure','Inadequate or aging infrastructure'))
cw_data['risks_to_citys_water_supply'] = cw_data['risks_to_citys_water_supply'].apply(lambda x: x.replace('Declining water quality: Declining water quality', 'Declining water quality'))
cw_data.risks_to_citys_water_supply.unique()
len(cw_data[id_columns]), len(cw_data[id_columns].drop_duplicates())
cw_data.drop_duplicates(subset=id_columns,inplace=True)
cw_data.head()
to_drop = ['timescale','current_population','population_year']
cw_data['population_year'] = cw_data['population_year'].fillna(0)
t = cw_data.sort_values('population_year' ).groupby([x for x in cw_data.columns if (x !='population_year') and x not in list_columns],dropna=False).last().reset_index()
cw_data
cw_data.drop(columns=['timescale','population_year'],inplace=True)
cw_data.rename(columns={'current_population':'population'},inplace=True)
len(cw_data[id_columns]), len(cw_data[id_columns].drop_duplicates())
Create value_columns containing the names of all the value columns to keep, plus the per-list length columns created below
value_columns = set([x for x in cw_value_columns if x not in to_drop] + ['risks_to_citys_water_supply'])
list_columns = cw_data.iloc[0].apply(lambda x: isinstance(x, list))
list_columns = list_columns[list_columns].index
len_columns = set()
for x in value_columns.copy():
    if x in list_columns:
        cw_data[x + '_n'] = cw_data[x].apply(lambda v: len(v) if v else 1)
        len_columns.add(x + '_n')
value_columns = value_columns.union(len_columns)
value_columns
len(cw_data[id_columns]), len(cw_data[id_columns].drop_duplicates())
We assume that cities participating in the C40 program also report better-structured data, so this subset is used as the reference for the imputation
c40_data = cw_data[cw_data['c40']].copy()
c40_data.shape
list_nulls = c40_data[list_columns].applymap(lambda x: len(x) == 0)
list_nulls.mean()
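To put these numbers in context, here is a quick sketch (hypothetical helper code, using the columns defined above) comparing the share of empty list entries between the C40 and non-C40 subsets, as a rough check of the data-quality assumption
# Sketch: compare emptiness of the list columns between C40 and non-C40 cities
for name, subset in [('C40', cw_data[cw_data['c40']]), ('non-C40', cw_data[~cw_data['c40']])]:
    empty_share = subset[list_columns].applymap(lambda v: len(v) == 0).mean().mean()
    print(f"{name}: average share of empty list entries = {empty_share:.2%}")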
c40_data['magnitude'].unique()
c40_risks = c40_data[id_columns+['magnitude']].copy()
c40_risks['low'] = c40_risks['magnitude'] == 'Less Serious'
c40_risks['medium'] = c40_risks['magnitude'] == 'Serious'
c40_risks['high'] = c40_risks['magnitude'] == 'Extremely serious'
c40_risks['unknown'] = c40_risks['magnitude'].isna()
c40_risks = c40_risks.drop(columns=['magnitude']).groupby([x for x in id_columns if x!="risks_to_citys_water_supply"]).agg(sum).reset_index()
c40_risks
from sklearn.preprocessing import LabelEncoder
from utils.nlp import SimilarityAnalysis, create_sim_vector
c40_data.risks_to_citys_water_supply = c40_data.risks_to_citys_water_supply.apply(lambda x: np.nan if x=='nan' else x)
c40_data[c40_data.risks_to_citys_water_supply.isnull()]
c40_data.risks_to_citys_water_supply.unique()
Fit a SimilarityAnalysis per string-list column of the value columns by concatenating their strings together, then vectorize each string-list column based on its pairwise similarities
analyses = {x:SimilarityAnalysis() for x in value_columns if x in list_columns}
vectorized = np.hstack([analyses[x].fit_transform(c40_data[x]) for x in analyses])
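For intuition, a minimal sketch of what such a similarity-based vectorizer could look like, built on TF-IDF and cosine similarity (hypothetical; the actual utils.nlp SimilarityAnalysis may work differently)
# Hypothetical sketch, not the utils.nlp implementation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class SimpleSimilarityAnalysis:
    """Vectorize string lists by their cosine similarity to the texts seen at fit time."""
    def fit_transform(self, column):
        # Concatenate each row's list of strings into a single document
        self.docs_ = [' '.join(items) if items else '' for items in column]
        self.vectorizer_ = TfidfVectorizer()
        self.ref_matrix_ = self.vectorizer_.fit_transform(self.docs_)
        return cosine_similarity(self.ref_matrix_, self.ref_matrix_)

    def transform(self, column):
        docs = [' '.join(items) if items else '' for items in column]
        return cosine_similarity(self.vectorizer_.transform(docs), self.ref_matrix_)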
cols_to_encode = [x for x in value_columns if x not in list_columns and x not in len_columns]
c40_data[cols_to_encode] = c40_data[cols_to_encode].fillna('nan')
encoders = {x:LabelEncoder() for x in cols_to_encode}
encoded = np.array([encoders[x].fit_transform(c40_data[x]) for x in cols_to_encode]).T.astype(float)
# Mark the encoded 'nan' category as missing so the imputer treats it as such
for t in range(len(cols_to_encode)):
    encoded[encoded[:, t] == np.where(encoders[cols_to_encode[t]].classes_ == 'nan')[0], t] = np.nan
lab_freqs = c40_data[cols_to_encode].groupby(cols_to_encode,as_index=False).size()
lab_freqs
whole = np.hstack([encoded, vectorized, c40_data[len_columns]])
import sys
sys.path.append('..')
from labeled_preprocessing.imputation import LabeledDatasetImputer
(LabeledDatasetImputer combines two IterativeImputers, one for the labels and one for the features)
imputer = LabeledDatasetImputer(verbose=2,k_features_per_label=0, seed=seed)
continuous_imputed, y = imputer.fit_transform(whole[:,2:], whole[:,:2], ret_imputed_x=True)
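For reference, a minimal sketch of how such a two-part imputer could be structured around scikit-learn's IterativeImputer (hypothetical; the real labeled_preprocessing.imputation class differs, e.g. it selects features per label via k_features_per_label and exposes a selection_mask)
# Hypothetical sketch, not the actual LabeledDatasetImputer
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

class TwoPartImputer:
    """Couples two iterative imputers: one for the features, one for the labels."""
    def __init__(self, seed=42):
        self.x_imputer = IterativeImputer(random_state=seed)
        self.y_imputer = IterativeImputer(random_state=seed)

    def fit_transform(self, X, y, ret_imputed_x=False):
        X_imp = self.x_imputer.fit_transform(X)
        # Impute the label columns jointly with the already-imputed features
        y_imp = self.y_imputer.fit_transform(np.hstack([X_imp, y]))[:, X_imp.shape[1]:]
        return (X_imp, y_imp) if ret_imputed_x else y_imp

    def transform(self, X, y, ret_imputed_x=False):
        X_imp = self.x_imputer.transform(X)
        y_imp = self.y_imputer.transform(np.hstack([X_imp, y]))[:, X_imp.shape[1]:]
        return (X_imp, y_imp) if ret_imputed_x else y_imp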
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
simple_model = make_pipeline(StandardScaler(), SVC(kernel='linear', C=0.01, random_state=seed))
scoring = 'accuracy'
scores = cross_val_score(simple_model, continuous_imputed, y[:,cols_to_encode.index('magnitude')], cv=5,scoring=scoring)
np.mean(scores)
import matplotlib.pyplot as plt
# Row mask: complete rows only; column mask: the two label columns plus the features kept by the imputer
flg = np.all(~np.isnan(whole), axis=1), np.hstack([[True, True], imputer.selection_mask[:]])
corr = np.corrcoef(whole[flg[0],:][:,flg[1]].T)
print(corr.shape)
plt.matshow(corr)
# Decode the imputed label codes back to their original string categories
imputed = np.array([
    encoders[x].inverse_transform(y[:, c].astype(int))
    for c, x in enumerate(cols_to_encode)
]).T
c40_data_imputed = c40_data.copy()
c40_data_imputed[cols_to_encode] = imputed
c40_data_imputed.head()
imp_lab_freqs = c40_data_imputed[cols_to_encode].groupby(cols_to_encode,as_index=False).size()
imp_ret = pd.merge(lab_freqs, imp_lab_freqs,suffixes=('','_imp'), how='right',on=cols_to_encode)
imp_ret['increase'] = (imp_ret['size_imp'] - imp_ret['size'])/imp_ret['size']
imp_ret
set([x for x in cw_data['risks_to_citys_water_supply'] if x not in encoders['risks_to_citys_water_supply'].classes_])
cw_data['risks_to_citys_water_supply'].value_counts()
Encode and vectorize the full dataset, using the encoders and SimilarityAnalysis objects fitted on the C40 subset
cw_data[cols_to_encode] = cw_data[cols_to_encode].fillna('nan')
encoded = np.array([encoders[x].transform(cw_data[x]) for x in cols_to_encode]).T.astype(float)
for t in range(len(cols_to_encode)):
    encoded[encoded[:, t] == np.where(encoders[cols_to_encode[t]].classes_ == 'nan')[0], t] = np.nan
all_vectorized = np.hstack([analyses[x].transform(cw_data[x]) for x in analyses])  # same column order as at fit time
Impute the full dataset using the imputer trained on the C40 subset
all_imputed_x, all_imputed_y = imputer.transform(np.hstack([all_vectorized,cw_data[len_columns]]), encoded, ret_imputed_x=True)
all_imputed_y_dec = np.array([
    encoders[x].inverse_transform(all_imputed_y[:, c].astype(int))
    for c, x in enumerate(cols_to_encode)
]).T
len(cw_data[id_columns]), len(cw_data[id_columns].drop_duplicates())
cw_data_imputed = cw_data.copy()
cw_data_imputed[cols_to_encode] = all_imputed_y_dec
cw_data_imputed.drop_duplicates(id_columns,inplace=True)
import matplotlib.pyplot as plt
corr = np.corrcoef(np.hstack([all_imputed_y, all_imputed_x]).T)
print(corr.shape)
plt.matshow(corr)
scores = cross_val_score(simple_model, all_imputed_x, all_imputed_y[:,cols_to_encode.index('magnitude')], cv=5,scoring=scoring)
np.mean(scores)
The score did not drop significantly, so we can assume the imputation was consistent across the full dataset
final_labeled_data = cw_data_imputed[['city','latitude','longitude','country','population','c40','magnitude','risks_to_citys_water_supply']].copy()
final_labeled_data.rename(columns={'magnitude': 'risk','risks_to_citys_water_supply': 'description'},inplace=True)
final_labeled_data[['city','latitude','longitude','c40', 'population','description','risk']]
cw_data_imputed[id_columns].drop_duplicates().shape
risks = final_labeled_data.description.unique()
risks_description = {risk: f'risk{c}' for c,risk in enumerate(risks)}
risks_df = pd.Series(risks_description).to_frame()
risks_df.reset_index(inplace=True)
risks_df.columns=['description','code']
final_labeled_data['description'] = final_labeled_data['description'].apply(lambda x: risks_description[x])
risks_df
main_index=['city','latitude','longitude', 'country']
Check whether the c40 flag is consistent per city, and resolve the rows where it is not
c40_check = final_labeled_data.groupby(main_index)['c40'].nunique()
c40_check[c40_check>1]
final_labeled_data.city.nunique()
final_labeled_data[final_labeled_data.city == 'Santiago']
final_labeled_data = final_labeled_data.sort_values(by='c40').groupby(main_index + ['description']).last().reset_index()
final_labeled_data = final_labeled_data.drop(columns=['c40']).merge(final_labeled_data.groupby(main_index)['c40'].max().reset_index(),on=main_index)
final_labeled_data[final_labeled_data.city=='Santiago']
final_labeled_data = final_labeled_data.sort_values(by='c40').groupby(main_index + ['description']).last().reset_index()
pop_df = final_labeled_data.groupby(main_index)[['population']].max().reset_index().drop_duplicates()
final_labeled_data = final_labeled_data.drop(columns=['population']).merge(pop_df, on=main_index)
final_labeled_data = final_labeled_data.pivot(index=main_index + ['c40','population'], columns='description', values='risk').reset_index()
severity_mapping = {'Less Serious':1, "Serious":2, 'Extremely serious':3}
pd.DataFrame([severity_mapping]).to_csv(SEVERITY_MAPPING_PATH,index=False)
nrisks = list(risks_description.values())
final_labeled_data[nrisks] = final_labeled_data[nrisks].replace(severity_mapping)
final_labeled_data.loc[final_labeled_data['c40'], nrisks] = final_labeled_data.loc[final_labeled_data['c40'], nrisks].fillna(0)
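To make the mapping concrete, a tiny hypothetical example (risk0/risk1 are illustrative column names) of how the severity replacement and the C40 fill behave
# Hypothetical illustration of the two steps above
demo = pd.DataFrame({'c40': [True, False],
                     'risk0': ['Serious', np.nan],
                     'risk1': [np.nan, 'Extremely serious']})
demo[['risk0', 'risk1']] = demo[['risk0', 'risk1']].replace(severity_mapping)
demo.loc[demo['c40'], ['risk0', 'risk1']] = demo.loc[demo['c40'], ['risk0', 'risk1']].fillna(0)
# C40 row: risk0 -> 2, risk1 -> 0 (missing treated as absent); non-C40 row: risk0 stays NaN, risk1 -> 3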
final_labeled_data.to_csv(LABELED_CITIES_PATH,index=False)
risks_df.to_csv(RISKS_MAPPING_PATH,index=False)
final_labeled_data.head()
cw_lab_freqs = cw_data[cols_to_encode].groupby(cols_to_encode,as_index=False).size()
imp_cw_lab_freqs = cw_data_imputed[cols_to_encode].groupby(cols_to_encode,as_index=False).size()
imp_cw_ret = pd.merge(cw_lab_freqs, imp_cw_lab_freqs,suffixes=('','_imp'), how='right',on=cols_to_encode)
imp_cw_ret['increase'] = (imp_cw_ret['size_imp'] - imp_cw_ret['size'])/imp_cw_ret['size']
imp_cw_ret
imp_report = pd.merge(imp_cw_ret, imp_ret, suffixes=('','_c40'), on=('magnitude','risks_to_citys_water_supply'))
imp_report.to_csv(IMPUTATION_REPORT_PATH,index=False)
imp_report